Number of papers over time

data source

We load the data from the Competence Centre for Bibliometrics: http://www.bibliometrie.info/. It licenses access to the Web of Science and Scopus bibliometric databases, which together cover a large share of the peer-reviewed research literature. The Competence Centre further processes the data of both databases so that they can be queried with SQL.

load libraries:


In [1]:
import cx_Oracle #ensure that the OS, InstantClient (Basic, ODBC, SDK) and cx_Oracle are all 64-bit. Install with "pip install cx_Oracle" and add the InstantClient directory to the PATH variable!
import pandas as pd
import re
import plotly.plotly as py
import plotly.graph_objs as go

set parameters


In [2]:
#parameters:
searchterm="big data" #must be lowercase (the query lowercases all compared fields)
colorlist=["#01be70","#586bd0","#c0aa12","#0183e6","#f69234","#0095e9","#bd8600","#007bbe","#bb7300","#63bcfc","#a84a00","#01bedb","#82170e","#00c586","#a22f1f","#3fbe57","#3e4681","#9bc246","#9a9eec","#778f00","#00aad9","#fc9e5e","#01aec1","#832c1e","#55c99a","#dd715b","#017c1c","#ff9b74","#009556","#83392a","#00b39b","#8e5500","#50a7c6","#f4a268","#02aca7","#532b00","#67c4bd","#5e5500","#f0a18f","#007229","#d2b073","#005d3f","#a5be6b","#2a4100","#8cb88c","#2f5c00","#007463","#5b7200","#787c48","#3b7600"]

load data from SQL database:


In [ ]:
dsn_tns=cx_Oracle.makedsn('127.0.0.1','6025',service_name='bibliodb01.fiz.karlsruhe')
#due to licence requirements, access is only allowed for members of the Competence Centre
#for Bibliometrics and its cooperation partners. You can still continue with the resulting CSV below.

#open connection (replace the placeholders with your credentials):
db=cx_Oracle.connect("<username>", "<password>", dsn_tns)
print(db.version)
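
To avoid hard-coding credentials in the notebook, you can also read them from environment variables. A minimal sketch, assuming the (hypothetical) variables BIBLIO_USER and BIBLIO_PASS were set in the shell beforehand:


In [ ]:
import os
#hypothetical variable names, set in the shell before starting the notebook:
db = cx_Oracle.connect(os.environ["BIBLIO_USER"], os.environ["BIBLIO_PASS"], dsn_tns)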

In [ ]:
#%% define sql-query function:
def read_query(connection, query):
    """Run a SQL query over the given connection and return the result as a DataFrame."""
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        names = [x[0] for x in cursor.description] #column names from the cursor metadata
        rows = cursor.fetchall()
        return pd.DataFrame(rows, columns=names)
    finally:
        if cursor is not None: #always close the cursor, even if the query fails
            cursor.close()
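
A quick usage example (Oracle's built-in DUAL table is handy as a connectivity check):


In [ ]:
#sanity check: should return a one-row, one-column DataFrame
read_query(db, "SELECT 1 AS test FROM dual")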

In [ ]:
#%% load paper titles from WOSdb:
database="wos_B_2016"          
            
command="""SELECT DISTINCT(ARTICLE_TITLE), PUBYEAR   
 FROM """+database+""".KEYWORDS, """+database+""".ITEMS_KEYWORDS, """+database+""".ITEMS 
 WHERE
 """+database+""".ITEMS_KEYWORDS.FK_KEYWORDS="""+database+""".KEYWORDS.PK_KEYWORDS
 AND """+database+""".ITEMS.PK_ITEMS="""+database+""".ITEMS_KEYWORDS.FK_ITEMS  
 AND (lower("""+database+""".KEYWORDS.KEYWORD) LIKE '%"""+searchterm+"""%' OR lower(ARTICLE_TITLE) LIKE '%"""+searchterm+"""%')
"""

dfWOS=read_query(db,command)
dfWOS['wos']=True #flag to make the source identifiable after merging
dfWOS.to_csv("all_big_data_titles_year_wos.csv", sep=';')


#%% load paper titles from SCOPUSdb:
database="SCOPUS_B_2016"            
            
command="""SELECT DISTINCT(ARTICLE_TITLE), PUBYEAR  
 FROM """+database+""".KEYWORDS, """+database+""".ITEMS_KEYWORDS, """+database+""".ITEMS 
 WHERE
 """+database+""".ITEMS_KEYWORDS.FK_KEYWORDS="""+database+""".KEYWORDS.PK_KEYWORDS
 AND """+database+""".ITEMS.PK_ITEMS="""+database+""".ITEMS_KEYWORDS.FK_ITEMS  
 AND (lower("""+database+""".KEYWORDS.KEYWORD) LIKE '%"""+searchterm+"""%' OR lower(ARTICLE_TITLE) LIKE '%"""+searchterm+"""%')
"""

dfSCOPUS=read_query(db,command)
dfSCOPUS['scopus']=True #flag to make the source identifiable after merging
dfSCOPUS.to_csv("all_big_data_titles_year_scopus.csv", sep=';')

#these queries take some time; we will work with the exported CSV files from here on
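
Note that the queries above splice the search term directly into the SQL string; for arbitrary input, an Oracle bind variable is the safer choice. A minimal sketch of the same filter (assuming, as the join above suggests, that ARTICLE_TITLE and PUBYEAR live in the ITEMS table):


In [ ]:
#sketch: pass the search term as a bind variable instead of concatenating it
cursor = db.cursor()
cursor.execute(
    "SELECT DISTINCT(ARTICLE_TITLE), PUBYEAR FROM wos_B_2016.ITEMS "
    "WHERE lower(ARTICLE_TITLE) LIKE :term",
    term="%" + searchterm + "%")
rows = cursor.fetchall()
cursor.close()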

merging data


In [14]:
dfWOS=pd.read_csv("all_big_data_titles_year_wos.csv",sep=";")
dfSCOPUS=pd.read_csv("all_big_data_titles_year_scopus.csv",sep=";")

df=pd.merge(dfWOS,dfSCOPUS,on='ARTICLE_TITLE',how='outer')
#consolidate PUBYEAR in one column: for papers that are (also) in WoS,
#PUBYEAR_y (from Scopus) may be missing, so copy the WoS year over:
df.loc[df['wos'] == 1, 'PUBYEAR_y'] = df['PUBYEAR_x']
#keep only the relevant columns and save the resulting csv again:
df=df[['ARTICLE_TITLE','PUBYEAR_y','wos','scopus']]
df.to_csv("all_big_data_titles_with_year.csv", sep=';')
df


Out[14]:
ARTICLE_TITLE PUBYEAR_y wos scopus
0 Big Data with Cloud Computing: an insight on t... 2014.0 True NaN
1 Understanding Democracy and Development Traps ... 2015.0 True NaN
2 Psycho-Informatics: Big Data shaping modern ps... 2014.0 True NaN
3 Keywords co-occurrence mapping knowledge domai... 2015.0 True NaN
4 Introducing TPCx-HS: The First Industry Standa... 2015.0 True NaN
5 Application and Exploration of Big Data Mining... 2016.0 True NaN
6 Performance Evaluation of a Natural Language P... 2014.0 True NaN
7 Context-aware Task Allocation for Fast Paralle... 2014.0 True NaN
8 Improving China's Corporate Governance Within ... 2015.0 True NaN
9 Big Data and Predictive Analytics in ERP Syste... 2014.0 True NaN
10 Re-Stream: Real-time and energy-efficient reso... 2015.0 True True
11 BIG DATA IN SURVEY RESEARCH 2015.0 True NaN
12 Models and Data Sources Used in Systems Medici... 2016.0 True NaN
13 Big data and precision 2015.0 True True
14 IoT-Security approach analysis for the novel n... 2014.0 True NaN
15 A meeting report from the 2013 GARNet workshop... 2015.0 True NaN
16 Learning methodologies for wireless big data n... 2016.0 True True
17 Reducing Data Dimensions for Systems Engineeri... 2014.0 True NaN
18 Twitter Streams Fuel Big Data Approaches to He... 2015.0 True NaN
19 THE LATENT STATE HAZARD MODEL, WITH APPLICATIO... 2015.0 True NaN
20 Deploying and Managing a Network of Autonomous... 2015.0 True NaN
21 The Person-Event Data Environment: leveraging ... 2013.0 True NaN
22 A secure and scalable storage system for aggre... 2015.0 True True
23 MaRDiGraS: Simplified Building of Reachability... 2013.0 True NaN
24 Power System Disaster-Mitigating Dispatch Plat... 2014.0 True NaN
25 A k-anonymity Method based on SEM (Search Engi... 2013.0 True NaN
26 Philosophical Reflections on Data 2014.0 True NaN
27 A Risk and Benefits Behavioral Model to Assess... 2013.0 True NaN
28 Complications of Laryngeal Masks in Children B... 2013.0 True NaN
29 From social data mining to forecasting socio-e... 2011.0 True NaN
... ... ... ... ...
9449 Big data analysis and data velocity 2015.0 NaN True
9450 I/O characteristics and implications of big da... 2015.0 NaN True
9451 Secure distribution of big data based on bitto... 2013.0 NaN True
9452 Modern aspects in development of branch applic... 2015.0 NaN True
9453 Multi-strategy based sina microblog data acqui... 2014.0 NaN True
9454 A novel Cp-Tree-based co-located classifier fo... 2015.0 NaN True
9455 Application of big data technology in support ... 2015.0 NaN True
9456 Real-time effective framework for unstructured... 2013.0 NaN True
9457 Big Data-Security and Privacy 2015.0 NaN True
9458 Research on public opinion based on Big Data 2015.0 NaN True
9459 Locally refined splines representation for geo... 2015.0 NaN True
9460 Big data study for coping with stress 2015.0 NaN True
9461 Digital Data Grows into Big Data 2015.0 NaN True
9462 SAW classification algorithm for Chinese text ... 2015.0 NaN True
9463 Interactive e-science cyberinfrastructure for ... 2015.0 NaN True
9464 Public policy considerations for data-driven i... 2013.0 NaN True
9465 The performance of MapReduce over the varying ... 2013.0 NaN True
9466 Understanding library user engagement strategi... 2015.0 NaN True
9467 Twitter Mining for Discovery, Prediction and C... 2015.0 NaN True
9468 Process optimization and monitoring along big ... 2015.0 NaN True
9469 RSenter: Terms mining tool from unstructured d... 2013.0 NaN True
9470 Resource management in cloud federation using ... 2014.0 NaN True
9471 Design and implementation of a dynamic educati... 2014.0 NaN True
9472 Big data for cyber physical systems an analysi... 2014.0 NaN True
9473 Designing a big data processing platform for a... 2013.0 NaN True
9474 Potential and Pitfalls for Big Data in Health ... 2015.0 NaN True
9475 GridKa school - Teaching information technolog... 2015.0 NaN True
9476 A survey on PCM-based big data storage and man... 2015.0 NaN True
9477 A distributed file system over heterogeneous s... 2015.0 NaN True
9478 Adaptive collaborative filtering based on scal... 2016.0 NaN True

9479 rows × 4 columns
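
As an aside, the .loc assignment above could equivalently be written with fillna, which takes the WoS year where present and falls back to the Scopus year. A sketch (it would run in place of that line, before the column selection drops PUBYEAR_x):


In [ ]:
#equivalent to the .loc assignment above (sketch, before PUBYEAR_x is dropped):
df['PUBYEAR_y'] = df['PUBYEAR_x'].fillna(df['PUBYEAR_y'])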

grouping data


In [17]:
grouped=df.groupby(['PUBYEAR_y'])
df2=grouped.agg('count').reset_index() #count() counts non-NaN cells per column
df2


Out[17]:
PUBYEAR_y ARTICLE_TITLE wos scopus
0 1995.0 1 1 0
1 2003.0 1 0 1
2 2004.0 1 1 0
3 2005.0 1 0 1
4 2006.0 2 0 2
5 2007.0 1 1 0
6 2008.0 4 3 1
7 2009.0 4 4 1
8 2010.0 7 4 3
9 2011.0 31 10 22
10 2012.0 323 106 228
11 2013.0 1421 570 904
12 2014.0 2652 1048 1789
13 2015.0 4111 1452 3127
14 2016.0 919 322 750
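
Note that the per-database columns can add up to more than the ARTICLE_TITLE total: count() skips NaN cells, so a paper indexed in both databases is counted once in the total but once in each source column. A toy example with made-up data:


In [ ]:
#toy example (made-up data) illustrating the NaN-skipping count:
toy = pd.DataFrame({'PUBYEAR_y': [2015, 2015],
                    'ARTICLE_TITLE': ['paper A', 'paper B'],
                    'wos': [True, None], 'scopus': [True, True]})
toy.groupby('PUBYEAR_y').count() #-> ARTICLE_TITLE: 2, wos: 1, scopus: 2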

visualize with plotly:

We make three diagrams: 1) a horizontal bar plot comparing the overall number of papers per database, 2) a vertical bar plot differentiating by time and database, and 3) the same vertical bar plot with a logarithmic y-scale, which allows for better inspection of the smaller numbers.


In [46]:
#set data for horizontal bar plot:
data = [go.Bar(
            x=[df2['wos'].sum(), df2['scopus'].sum(), df2['ARTICLE_TITLE'].sum()],
            y=['Web of Science', 'Scopus', 'Total'],
            orientation = 'h',
            marker=dict(
                color=colorlist
            )
)]
#py.plot(data, filename='big_data_papers_horizontal') #for uploading to plotly
py.iplot(data, filename='horizontal-bar')


Out[46]:
'https://plot.ly/~mathias.riechert/131'
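
If you have no plotly account, plotly's offline mode renders the same figure locally in the notebook (plotly 2.x API, matching the imports above):


In [ ]:
#alternative without a plotly account: render locally in the notebook
import plotly.offline as pyo
pyo.init_notebook_mode(connected=True) #loads plotly.js into the notebook
pyo.iplot(data)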

In [47]:
#set data for stacked bar plot:
trace1 = go.Bar(
    x=df2['PUBYEAR_y'],
    y=df2['wos'],
    name='Web of Science',
    marker=dict(
        color=colorlist[0]
    )
)
trace2 = go.Bar(
    x=df2['PUBYEAR_y'],
    y=df2['scopus'],
    name='Scopus',
    marker=dict(
        color=colorlist[1]
    )
)
trace3 = go.Bar(
    x=df2['PUBYEAR_y'],
    y=df2['ARTICLE_TITLE'],
    name='All Papers',
    marker=dict(
        color=colorlist[2]
    )
)
data = [trace1, trace2,trace3]

In [54]:
#set layout for grouped bar chart with normal y scale:
layout_no_log = go.Layout(
    title='Big data papers over time',
    barmode='group',
    xaxis=dict(
        title='year',
        titlefont=dict(
            family='Arial, sans-serif',
            size=14,
            color='lightgrey'
        ),
        tickfont=dict(
            family='Arial, sans-serif',
            size=10,
            color='black'
        ),
        showticklabels=True,
        dtick=1,
        tickangle=45,
    )
)
#plot:
fig1 = go.Figure(data=data, layout=layout_no_log)
py.iplot(fig1, filename='big_data_papers_no_log')

In [44]:
#set layout for grouped bar chart with logarithmic y scale:
layout_log = go.Layout(
    title='Big data papers over time (log y-scale)',
    barmode='group',
    xaxis=dict(
        title='year',
        titlefont=dict(
            family='Arial, sans-serif',
            size=14,
            color='lightgrey'
        ),
        tickfont=dict(
            family='Arial, sans-serif',
            size=10,
            color='black'
        ),
        showticklabels=True,
        dtick=1,
        tickangle=45,
    ),
    yaxis=dict(
        type='log'
    )
)
fig2 = go.Figure(data=data, layout=layout_log)
py.iplot(fig2, filename='big_data_papers_log')


Out[44]:
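
At the time of writing, the plotly cloud API could also export a static image of a figure; a sketch, assuming an authenticated plotly 2.x session:


In [ ]:
#optional: save a static PNG via the plotly cloud (requires API credentials):
py.image.save_as(fig2, filename='big_data_papers_log.png')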